Fix: After training the model, subtract the effect of the mean expression from the model score
Problems:
Done with a linear model, so a bit of the effect remains
After the transformation you lose the probability interpretation of the score (translated it to have the same mean as the original distribution, but I’m not sure if that’s the right way to do it)
When training the model, the bias is still there, so perhaps it would be better to remove the effect somehow from the descriptive features and not from the resulting score
library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(Rtsne)
library(knitr)
library(ROCR)
library(expss)
# Map SFARI gene-score categories (1-6) plus auxiliary categories to a fixed
# colour palette. `r` may be a single index or a vector of indices
# (e.g. c(1:6, 8, 7) as used in the plots below); returns the colour(s).
# Fix: return the subset directly instead of via an intermediate assignment,
# which made the function return its value invisibly.
SFARI_colour_hue = function(r) {
  c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
# Regression data
# Loads the fitted logistic-regression objects used below — presumably
# negative_set and test_set with the model's `prob` scores; confirm against
# the modelling script that produced this .RData file
load(file='./../Data/Gandal/logreg_model.RData')
# Mean Expression data
# Loads the preprocessed expression matrix (datExpr, genes in rows) and the
# differential-expression results (DE_info), among other objects
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# Dataset created with DynamicTreeMerged algorithm
clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
# Per-gene dataset for the selected clustering (rownames = gene IDs)
original_dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'), row.names=1)
# Drop objects loaded from the .RData files that this analysis does not use
rm(dds, datGenes, datMeta)
The linear trend is gone, but some other non-linear trends still remain
# 1. Fit line to prob ~ meanExpr
# Quantify the linear bias of the model score w.r.t. mean expression.
# NOTE(review): plot_data with `prob` and `meanExpr` columns is assumed to be
# built in an earlier chunk — confirm; it is redefined later in this file
linear_fit = lm(prob ~ meanExpr, data=plot_data)
# Estimated bias: change in model score per unit of mean expression
slope = linear_fit$coefficients[['meanExpr']]
print(paste0('Slope: ', slope))
## [1] "Slope: 0.0174544683661941"
# 2. Remove mean expression signal
# Subtract the linear mean-expression bias from the model score.
#   df:    data frame with a `prob` column; rownames are gene IDs
#   slope: slope of the prob ~ meanExpr fit estimated above
# Reads the global `datExpr` expression matrix for the genes in `df`.
# Returns a corrected score whose mean equals the original distribution's.
remove_slope = function(df, slope){
  meanExpr = rowMeans(datExpr[rownames(df), , drop=FALSE])
  # Bug fix: meanExpr is a named vector, so rownames(meanExpr) was always
  # NULL and the original guard (all(NULL == rownames(df))) could never fire
  if(!all(names(meanExpr)==rownames(df))){
    stop('Row names do not match!')
  }
  # Centre the bias term so the corrected scores keep the original mean
  unbiased_score = df$prob + slope*(mean(meanExpr) - meanExpr)
  return(unbiased_score)
}
# Apply the bias correction to both gene sets
negative_set$corrected_score = remove_slope(negative_set, slope)
test_set$corrected_score = remove_slope(test_set, slope)
# Rebuild plot_data: per-gene mean expression joined to the negative set
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>%
right_join(negative_set %>% mutate(ID=rownames(negative_set)), by='ID')
# Corrected score vs mean expression: loess trend (gray) + linear fit
plot_data %>% ggplot(aes(meanExpr, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) + ylab('Corrected Score') + xlab('Mean Expression') +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
theme_minimal() + ggtitle('Mean expression vs model score corrected by linear fit by gene')
# Correction is done; tidy up the intermediates
rm(slope, linear_fit, remove_slope)
Confusion Matrix
# Cross-tabulate actual SFARI labels against the CORRECTED predictions.
# Bug fix: the original passed conf_mat$pred (the uncorrected model's
# predictions) to cro(), even though the labels and this section refer to
# the corrected predictions created in the mutate() below.
conf_mat = test_set %>% mutate(corrected_pred = corrected_score>0.5) %>%
apply_labels(SFARI = 'Actual Labels',
corrected_score = 'Corrected Score',
corrected_pred = 'Corrected Label Prediction')
cro(conf_mat$SFARI, list(conf_mat$corrected_pred, total()))
| | FALSE | TRUE | #Total |
|---|---|---|---|
| Actual Labels | | | |
| FALSE | 101 | 64 | 165 |
| TRUE | 64 | 101 | 165 |
| #Total cases | 165 | 165 | 330 |
rm(conf_mat)
Accuracy
Accuracy went up? That doesn’t make sense — perhaps it’s because the sample is small
# Accuracy of the corrected score on the test set, thresholded at 0.5
acc <- with(test_set, mean(SFARI == (corrected_score > 0.5)))
print(paste0('Accuracy = ', round(acc, 4)))
## [1] "Accuracy = 0.6152"
rm(acc)
ROC Curve
AUC decreased by 0.01
# ROC curve for the corrected scores on the test set.
# pred_ROCR is kept alive for the lift curve in the next chunk
pred_ROCR = prediction(test_set$corrected_score, test_set$SFARI)
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
# Scalar area under the curve, extracted from the S4 performance object
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(AUC,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')  # chance-level diagonal
rm(roc_ROCR, AUC)
Lift Curve
# Lift curve: lift vs rate of positive predictions (rpp)
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')
# pred_ROCR is no longer needed after this chunk
rm(lift_ROCR, pred_ROCR)
Looks very similar to before
# Corrected-score distribution split by SFARI label
plot_data = test_set %>% dplyr::select(corrected_score, SFARI)
plot_data %>% ggplot(aes(corrected_score, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
theme_minimal() + ggtitle('Model score distribution by SFARI Label')
The positive relation between SFARI scores and Model scores is still there
# Attach the SFARI gene score (1-6 / None) to each test-set gene
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, corrected_score) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
dplyr::select(ID, corrected_score, gene.score) %>% apply_labels(gene.score='SFARI Gene score')
# Frequency table of SFARI gene scores in the test set
cro(plot_data$gene.score)
| | #Total |
|---|---|
| SFARI Gene score | |
| 1 | 8 |
| 2 | 11 |
| 3 | 34 |
| 4 | 76 |
| 5 | 32 |
| 6 | 4 |
| None | 165 |
| #Total cases | 330 |
# Mean corrected score per SFARI score, used for the dashed reference lines
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_corrected_score = mean(corrected_score))
# Density of corrected scores by SFARI score, with per-group mean markers
plot_data %>% ggplot(aes(corrected_score, color=gene.score, fill=gene.score)) + geom_density(alpha=0.25) +
geom_vline(data=mean_vals, aes(xintercept=mean_corrected_score, color=gene.score), linetype='dashed') +
scale_colour_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
ggtitle('Distribution of the Model scores by SFARI score') +
xlab('Corrected Score') + ylab('Density') + theme_minimal()
# Same comparison as interactive boxplots
ggplotly(plot_data %>% ggplot(aes(gene.score, corrected_score, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
ggtitle('Distribution of the Model scores by SFARI score') +
xlab('SFARI score') + ylab('Model score') + theme_minimal())
rm(mean_vals)
Print genes with highest corrected scores in test set
First gene no longer has score of 1, so it probably was there because of its high level of expression
Not a single gene with score 6
Very few genes without a SFARI score
# Top 50 corrected scores in the test set, annotated with the SFARI score,
# module-diagnosis correlation (MTcor) and module colour for each gene
test_set %>% dplyr::select(corrected_score, SFARI) %>% mutate(ID = rownames(test_set)) %>%
arrange(desc(corrected_score)) %>% top_n(50, wt=corrected_score) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
dplyr::select(ID, corrected_score, SFARI, gene.score, MTcor, matches(clustering_selected)) %>%
kable
| ID | corrected_score | SFARI | gene.score | MTcor | DynamicHybridMergedSmall |
|---|---|---|---|---|---|
| ENSG00000152217 | 0.8427161 | TRUE | 3 | 0.2020738 | #00B70C |
| ENSG00000139767 | 0.8346415 | TRUE | 5 | -0.5362834 | #00C0BB |
| ENSG00000127124 | 0.8218297 | TRUE | 3 | 0.2020738 | #00B70C |
| ENSG00000162105 | 0.8193478 | TRUE | 2 | -0.5362834 | #00C0BB |
| ENSG00000118058 | 0.8177748 | TRUE | 1 | 0.2020738 | #00B70C |
| ENSG00000147010 | 0.8147129 | TRUE | 5 | 0.2020738 | #00B70C |
| ENSG00000119866 | 0.8018502 | TRUE | 2 | -0.5362834 | #00C0BB |
| ENSG00000049618 | 0.7997779 | TRUE | 1 | 0.2020738 | #00B70C |
| ENSG00000183117 | 0.7986013 | TRUE | 4 | -0.9216750 | #C17FFF |
| ENSG00000125851 | 0.7924033 | FALSE | None | -0.6557529 | #53B400 |
| ENSG00000128815 | 0.7896276 | TRUE | 3 | 0.6236224 | #00BF7D |
| ENSG00000169057 | 0.7872776 | TRUE | 2 | -0.6714074 | #8195FF |
| ENSG00000133216 | 0.7850113 | TRUE | 4 | 0.4840783 | #00B6EB |
| ENSG00000170579 | 0.7754854 | TRUE | 3 | -0.9216750 | #C17FFF |
| ENSG00000148737 | 0.7656936 | TRUE | 3 | -0.5362834 | #00C0BB |
| ENSG00000116539 | 0.7642164 | TRUE | 1 | 0.2020738 | #00B70C |
| ENSG00000197892 | 0.7563702 | TRUE | 4 | 0.2020738 | #00B70C |
| ENSG00000185345 | 0.7550436 | TRUE | 3 | -0.9216750 | #C17FFF |
| ENSG00000176697 | 0.7491887 | TRUE | 5 | 0.4840783 | #00B6EB |
| ENSG00000082701 | 0.7485968 | TRUE | 5 | -0.5362834 | #00C0BB |
| ENSG00000163635 | 0.7483507 | TRUE | 5 | 0.4727632 | #F365E6 |
| ENSG00000197694 | 0.7457768 | FALSE | None | -0.5362834 | #00C0BB |
| ENSG00000133958 | 0.7456077 | TRUE | 3 | 0.2020738 | #00B70C |
| ENSG00000170043 | 0.7433856 | FALSE | None | -0.5362834 | #00C0BB |
| ENSG00000126705 | 0.7430874 | TRUE | 3 | -0.5362834 | #00C0BB |
| ENSG00000089012 | 0.7397676 | FALSE | None | 0.4840783 | #00B6EB |
| ENSG00000121297 | 0.7393241 | TRUE | 4 | 0.2020738 | #00B70C |
| ENSG00000105409 | 0.7326351 | TRUE | 4 | -0.5362834 | #00C0BB |
| ENSG00000071242 | 0.7317059 | TRUE | 4 | -0.5362834 | #00C0BB |
| ENSG00000130477 | 0.7261053 | TRUE | 4 | -0.9216750 | #C17FFF |
| ENSG00000074054 | 0.7260603 | TRUE | 3 | -0.4966667 | #00C1A8 |
| ENSG00000157014 | 0.7223243 | FALSE | None | -0.5362834 | #00C0BB |
| ENSG00000172292 | 0.7217194 | FALSE | None | -0.9216750 | #C17FFF |
| ENSG00000112640 | 0.7145476 | TRUE | 4 | -0.6557529 | #53B400 |
| ENSG00000198752 | 0.7076024 | TRUE | 3 | -0.5362834 | #00C0BB |
| ENSG00000173809 | 0.7071743 | FALSE | None | 0.3841921 | #E88523 |
| ENSG00000170485 | 0.7009354 | TRUE | 4 | -0.5362834 | #00C0BB |
| ENSG00000132821 | 0.6990407 | FALSE | None | -0.6714074 | #8195FF |
| ENSG00000156110 | 0.6976490 | TRUE | 4 | -0.6714074 | #8195FF |
| ENSG00000101842 | 0.6967189 | FALSE | None | -0.8639848 | #C49A00 |
| ENSG00000139910 | 0.6955391 | FALSE | None | -0.9216750 | #C17FFF |
| ENSG00000136925 | 0.6954479 | FALSE | None | 0.4840783 | #00B6EB |
| ENSG00000117009 | 0.6933030 | TRUE | 5 | 0.4727632 | #F365E6 |
| ENSG00000107105 | 0.6913609 | TRUE | 4 | -0.9216750 | #C17FFF |
| ENSG00000136531 | 0.6906225 | TRUE | 1 | -0.9216750 | #C17FFF |
| ENSG00000135365 | 0.6869061 | TRUE | 4 | 0.2020738 | #00B70C |
| ENSG00000187555 | 0.6831712 | TRUE | 2 | -0.6557529 | #53B400 |
| ENSG00000137075 | 0.6816073 | TRUE | 4 | -0.5362834 | #00C0BB |
| ENSG00000100354 | 0.6782077 | TRUE | 2 | 0.4843988 | #8EAB00 |
| ENSG00000083544 | 0.6765330 | FALSE | None | 0.2020738 | #00B70C |
The top scores were affected the most (decreasing)
# Original vs corrected score; the dashed diagonal is the identity
# (points below it were decreased by the correction)
negative_set %>% ggplot(aes(prob, corrected_score)) + geom_point(alpha=0.2, color='#0099cc') +
geom_abline(slope=1, intercept=0, color='#e6e6e6', linetype='dashed') +
geom_smooth(color='#666666', alpha=0.5, se=FALSE, size=0.5) + coord_fixed() +
xlab('Original score') + ylab('Corrected score') + theme_minimal()
More genes are predicted as true than before (4975 before)
# Threshold the corrected scores on the negative set and tabulate the labels
negative_set_table = negative_set %>% mutate(corrected_pred = corrected_score>0.5) %>%
apply_labels(corrected_score = 'Corrected Probability',
corrected_pred = 'Corrected Label Prediction')
cro(negative_set_table$corrected_pred)
| | #Total |
|---|---|
| Corrected Label Prediction | |
| FALSE | 8966 |
| TRUE | 5073 |
| #Total cases | 14039 |
# Report how many negative-set genes are predicted as ASD-related.
# Bug fix: corrected_pred exists only on negative_set_table (created above),
# not on negative_set, so the original sum(negative_set$corrected_pred)
# summed NULL and always printed 0 — contradicting the 5073 TRUE in the table.
cat(paste0('\n', sum(negative_set_table$corrected_pred), ' genes are predicted as ASD-related'))
##
## 5073 genes are predicted as ASD-related
The relation is not as strong as before in the highest scores, but it became stronger in the lowest scores, probably because they have a higher level of expression than the average gene, so their scores may have been increased.
This could mean that a linear fit is not the right way to fix this
# Corrected score vs Gene Significance (GS), coloured by MTcor.
# The fainter loess (mapped to x=prob) is the original, uncorrected trend
negative_set %>% ggplot(aes(corrected_score, GS, color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
geom_line(stat='smooth', method='loess', color='#666666', alpha=0.5, size=1.2, aes(x=prob)) +
geom_hline(yintercept=mean(negative_set$GS), color='gray', linetype='dashed') +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
ggtitle('Relation between the Model\'s Corrected Score and Gene Significance') + theme_minimal()
Not as high as before in the highest scores, a bit higher on the lowest scores
The transparent version of the trend line is the original trend line
# Same comparison against |GS|; the fainter loess (x=prob) is the original
# trend. NOTE(review): the dashed line uses a precomputed absGS column —
# presumably mean(abs(GS)); confirm it matches the abs(GS) plotted on the y-axis
negative_set %>% ggplot(aes(corrected_score, abs(GS), color=MTcor)) + geom_point() +
geom_hline(yintercept=mean(negative_set$absGS), color='gray', linetype='dashed') +
geom_smooth(method='loess', color='#666666') +
geom_line(stat='smooth', method='loess', color='#666666', alpha=0.4, size=1.2, aes(x=prob)) +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
ggtitle('Relation between the Model\'s Corrected Score and Gene Significance') + theme_minimal()
Summarised version of score vs mean expression, plotting by module instead of by gene
The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same
The transparent version of each point and line are the original values and trends before the bias correction
# Per-module summary: original (mean, sd) and corrected (new_mean, new_sd)
# scores, one row per module (modules identified by their MTcor value)
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob),
new_mean = mean(corrected_score), new_sd = sd(corrected_score), n = n()) %>%
mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% left_join(original_dataset, by='MTcor') %>%
dplyr::select(matches(clustering_selected), MTcor, MTcor_sign, mean, new_mean, sd, new_sd, n) %>% distinct()
# First column is the clustering's module identifier; rename it to ID
colnames(plot_data)[1] = 'ID'
# Solid points/lines = corrected means; transparent = original means
ggplotly(plot_data %>% ggplot(aes(MTcor, new_mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID)) +
geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) +
geom_point(aes(y=mean), alpha=0.3) + theme(legend.position='none') +
geom_line(stat='smooth', method='loess', color='gray', se=FALSE, alpha=0.3, size=1.2, aes(y=mean)) +
geom_line(stat='smooth', method='lm', se=FALSE, alpha=0.3, size=1.2, aes(y=mean)) +
xlab('Module-Diagnosis correlation') + ylab('Mean Corrected Score by the Model') + theme_minimal())
Check if correcting by gene also corrected by module
Yes, but just a bit
# Per-gene mean and standard deviation of expression
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))
# Join expression stats and the module assignment onto the negative set
plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% left_join(mean_and_sd, by='ID') %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>%
dplyr::select(ID, matches(clustering_selected)), by='ID')
# The clustering column was joined last; give it a stable name
colnames(plot_data)[ncol(plot_data)] = 'Module'
# Module-level means of expression and of the old/corrected scores
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob),
new_meanProb = mean(corrected_score), n=n())
# Solid = corrected values/trend; transparent = original values/trend
ggplotly(plot_data2 %>% ggplot(aes(meanExpr, new_meanProb, size=n)) +
geom_point(color=plot_data2$Module) + geom_point(color=plot_data2$Module, alpha=0.3, aes(y=meanProb)) +
geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
geom_line(stat='smooth', method='loess', se=TRUE, color='gray', alpha=0.4, size=1.2, aes(y=meanProb)) +
theme_minimal() + theme(legend.position='none') +
ggtitle('Mean expression vs corrected Model score by Module'))
rm(plot_data2, mean_and_sd)
The relation between SD and score became bigger than before (???)
# SD of expression vs score (genes with sdExpr < 0.5 only);
# the transparent line is the fit to the original, uncorrected scores
plot_data %>% filter(sdExpr<0.5) %>% ggplot(aes(sdExpr, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
geom_line(stat='smooth', method='lm', color='#999999', se=FALSE, alpha=0.4, size=1.5, aes(y=prob)) +
theme_minimal() + ggtitle('SD vs model probability by gene')
This approximation curve looks like the opposite of the trend found between mean/sd and model scores
# Mean expression vs SD by gene, on log-log axes
plot_data %>% ggplot(aes(meanExpr, sdExpr)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
scale_x_log10() + scale_y_log10() +
theme_minimal() + ggtitle('Mean expression vs SD by gene')
This relation improved a lot, so the model seems to be getting better
# Corrected score vs log2 fold change (|lfc| < 10 to drop extreme outliers);
# the transparent loess is the trend for the original scores
plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>%
left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')
plot_data %>% filter(abs(log2FoldChange)<10) %>%
ggplot(aes(log2FoldChange, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
geom_line(stat='smooth', method='loess', color='gray', alpha=0.3, size=1.5, aes(y=prob)) +
theme_minimal() + ggtitle('lfc vs model probability by gene')
Conclusion: this bias correction seems to work up to a point, but some non-linear patterns remain, and the relation with the SD seems to have increased